# topic 22
#
# This will be remarkably similar to the work in
# topic 21, so much so that it is worth examining the
# two to see the changes.
#
# First, set up the situation.  We have two populations
# with unknown standard deviations.
#
# NOTE: gnrnd5()/gnrnd4() are the course's data generators;
# each call appears to leave its generated values in the
# global variable L1 (see the "pop_one <- L1" style copies
# right after each call), so L1 must be copied out before
# the next generator call overwrites it.
#
source("../gnrnd5.R")
source("../gnrnd4.R")

# generate the two populations; copy each out of L1
gnrnd5( 156823499902, 933002363)
pop_one <- L1
gnrnd5( 177843499903, 1143001693)
pop_two <- L1

################################################
###     Confidence Interval                  ###
################################################
# We want to construct a 95% confidence interval
# for the difference of the means, mu_one - mu_two.
# To do this we will take random samples of both;
# the size of the sample from pop_one will be 38
# and the size of the sample from pop_two will
# be 43.
n1 <- 38
n2 <- 43
# For now, we will get the samples by generating the
# appropriate random index numbers for the two
# populations.

# get the sample and sample statistics from pop_one
gnrnd4( 982043701, 500000001)
L1                  # show the generated index values
index_one <- L1
samp_one <- pop_one[ index_one ]
samp_one
x_bar_one <- mean( samp_one )
x_bar_one

# get the sample and sample statistics from pop_two
gnrnd4( 338064201, 500000001)
L1                  # show the generated index values
index_two <- L1
samp_two <- pop_two[ index_two ]
samp_two
x_bar_two <- mean( samp_two )
x_bar_two

#### Therefore we can find the difference of the sample means
diff_of_means <- x_bar_one - x_bar_two
diff_of_means

#### This is an instance where we do not know the standard deviation
#### of the underlying populations.  Therefore, we will use
#### the Student's-t distribution as the model for the distribution
#### of the difference of the sample means.  As part of
#### this we will find the standard error by using the
#### standard deviations of the samples:
####    sd_of_diff =
####      sqrt( sd(samp_one)^2/n1 + sd(samp_two)^2/n2 )
#### for samples of size n1 and n2, respectively.
## We took our samples (see above), so we can compute
## the standard error of the difference of the means.
sd_of_diff <- sqrt( sd(samp_one)^2/n1 + sd(samp_two)^2/n2)
sd_of_diff
# For a 95% confidence interval, we are missing 5% total,
# and we need to split this in half, 2.5% for the
# top and bottom.  However, since this is a Student's-t
# distribution we need to determine the degrees of
# freedom.

######################### quick and dirty
# The quick and dirty way is to use one less than the
# size of the smaller sample.  In our case, that would
# be 38-1 or 37 degrees of freedom.
#
# Get the t value from a Student's-t with 37 degrees
# of freedom and with 2.5% above it.
# NOTE(review): this name shadows the base function t()
# (matrix transpose) for the rest of the session; a name
# such as t_value would be safer.
t <- qt( 0.025, 37, lower.tail=FALSE)
t
# then our margin of error is
MOE <- t*sd_of_diff
MOE
# so our confidence interval goes from
diff_of_means - MOE
# to
diff_of_means + MOE

# That pattern of commands would be the same for each
# instance of such a problem.  We have a function
# that does this.  It is called ci_2unknown().
source("../ci_2unknown.R")
ci_2unknown(sd(samp_one), n1, x_bar_one, sd(samp_two), n2, x_bar_two, 0.95)
#### We find the values that we computed above under the
#### Simp Low, Simp High, Simp MOE, DF Simp, and t Simp
#### labels.

#### But we recall that there was a more complex way to
#### determine the degrees of freedom.  We can do that here:

# compute the degrees of freedom (the Welch-Satterthwaite formula)
d_one <- sd( samp_one )^2/n1
d_two <- sd( samp_two)^2/n2
d_one
d_two
df <-(d_one+d_two)^2/( d_one^2/(n1-1)+d_two^2/(n2-1))
# full computed degrees of freedom
df
# Now, with our new degrees of freedom, we can repeat
# our earlier steps.
#
# Get the t value from a Student's-t with df degrees
# of freedom and with 2.5% above it.
t <- qt( 0.025, df, lower.tail=FALSE)
t
# then our margin of error is
MOE <- t*sd_of_diff
MOE
# so our confidence interval goes from
diff_of_means - MOE
# to
diff_of_means + MOE

# If we run the function again we can compare
# these "full" or "computed" results with those
# from the function.
ci_2unknown(sd(samp_one), n1, x_bar_one, sd(samp_two), n2, x_bar_two, 0.95)

################################################
###     Hypothesis test                      ###
################################################
# Now, someone says that they believe that the
# difference of the means from pop_one and pop_two,
# mu_1 - mu_2 = 0, that the means are the same.
# Thus our null hypothesis, H0, is that the difference is 0.
# Our alternative hypothesis, H1, is that the
# difference of the means is greater than 0, or
# that mu_one > mu_two.
# We want to see if we can reject H0 in favor of H1.
# We will get a sample from each
# population, n1=38 and n2=43.
# We are willing to be wrong in telling the
# person that they are wrong 2.5% of the time!

########################
## The critical value approach.  We know that two
## samples of size 38 and 43, respectively, will have
## a standard error of the difference of the means
## be sqrt( sigma_one^2/38 + sigma_two^2/43 ) and that
## would be a normal distribution.  Unfortunately,
## we know neither sigma_one nor sigma_two.  The
## best we can do is to use sd(samp_one) and
## sd( samp_two ) to approximate those, and in that
## case we need to use a Student's-t distribution
## rather than a normal distribution.
#
#### we computed this above, but let us do it again
###
sd_of_diff <- sqrt( sd(samp_one)^2/n1 + sd(samp_two)^2/n2)
sd_of_diff
## Now to use the Student's-t we need to decide on
## the degrees of freedom.

######################### quick and dirty
# The quick and dirty way is to use one less than the
# size of the smaller sample.  In our case, that would
# be 38-1 or 37 degrees of freedom.
#
# Get the t value from a Student's-t with 37 degrees
# of freedom and with 2.5% above it.
t_val <- qt( 0.025, 37, lower.tail=FALSE)
t_val
# Because the null hypothesis is that the difference is 0,
# the critical high value will be 0+t_val*sd_of_diff, or
t_val*sd_of_diff    # our critical high value
# we compare the difference of the means to that critical value
diff_of_means
# That difference is not greater than our critical high.
# Therefore, we would not reject H0 in favor of H1.

######################
## The attained significance approach asks: if H0 is true,
## then how strange would it be to get a difference of the
## means at this value or at any stranger value?  But
## that is just a pt() command.  However, because pt()
## does not allow us to specify the standard deviation, we
## need to normalize our value.
test_val <- diff_of_means / sd_of_diff
test_val
pt( test_val, 37, lower.tail=FALSE)
#
# That attained significance is not less than our given
# level of significance, 0.025, so we would not reject H0
# in favor of H1.

#################################
## All of that was done using the "quick and dirty"
## determination of the degrees of freedom.  We have
## the computed degrees of freedom, which we did
## above, but we will do it again here:
d_one <- sd( samp_one )^2/n1
d_two <- sd( samp_two)^2/n2
d_one
d_two
df <-(d_one+d_two)^2/( d_one^2/(n1-1)+d_two^2/(n2-1))
# full computed degrees of freedom
df
# Now, with our new degrees of freedom, we can repeat
# our earlier steps.

### the critical value approach
#
# Get the t value from a Student's-t with df degrees
# of freedom and with 2.5% above it.
t_val <- qt( 0.025, df, lower.tail=FALSE)
t_val
# Because the null hypothesis is that the difference is 0,
# the critical high value will be 0+t_val*sd_of_diff, or
t_val*sd_of_diff    # our critical high value
# we compare the difference of the means to that critical value
diff_of_means
# That difference is not greater than our critical high.
# Therefore, we would not reject H0 in favor of H1.

######################
## The attained significance approach asks: if H0 is true,
## then how strange would it be to get a difference of the
## means at this value or at any stranger value?  But
## that is just a pt() command.  However, because pt()
## does not allow us to specify the standard deviation, we
## need to normalize our value.
test_val <- diff_of_means / sd_of_diff
test_val
pt( test_val, df, lower.tail=FALSE)
#
# That attained significance is not less than our given
# level of significance, 0.025, so we would not reject H0
# in favor of H1.

#########################
## Of course, for any similar problem we would be doing
## the same steps, with the additional complication of
## having to pay attention to the alternative.  If H1 is
## mu_one < mu_two, which is the same as mu_one-mu_two<0,
## then we would get a critical low value and our
## pt() would use the lower tail.  If H1 is
## mu_one != mu_two, which is the same as
## mu_one-mu_two != 0, then we would need to get both a
## critical low and a critical high, splitting the area
## of significance between the low and high.  In the
## same situation, once we get our attained value on
## one side, high or low, we would need to double it to
## account for values that are just as extreme or more
## extreme on the other side.
##
## Rather than think out all of these issues each time
## we run into this problem, we might as well capture
## all of this in a function, hypoth_2test_unknown().
## Let us load and run that for the values in our example.
source( "../hypo_2unknown.R")
hypoth_2test_unknown( sd( samp_one), n1, x_bar_one,
                      sd( samp_two), n2, x_bar_two,
                      1, 0.025 )

######################################################
######################################################
## Of course, we really could have found the means
## of the two populations.
source("../pop_sd.R")

# The true population means, and hence the true difference,
# are available to us here.
mu_one <- mean( pop_one )
mu_one
mu_two <- mean( pop_two )
mu_two
real_diff <- mu_one - mu_two
real_diff
# and for that matter we could find the
# two population standard deviations
sigma_one <- pop_sd( pop_one )
sigma_one
sigma_two <- pop_sd( pop_two)
sigma_two

# We will run through 10000 cases where we
# get samples of size 38 and 43, and we will
# use those samples, specifically the sample
# means, to generate a 95% confidence interval
# for the difference of the population means.
# In the process we will see how many of those
# intervals contain the true difference.
#
# NOTE(review): L1 is reused here, presumably as a holder
# for one result per trial -- confirm against the rest of
# the loop body.  Also note that the loop reassigns
# samp_one and samp_two, clobbering the samples drawn above.
L1 <- 1:10000
for( i in 1:10000 )
{
  samp_one <- sample( pop_one, 38 )
  samp_two <- sample( pop_two, 43 )
  this_interval <- ci_2unknown( sd(samp_one), 38, mean( samp_one),
                                sd(samp_two), 43, mean( samp_two),
                                0.95 )
  if( this_interval[1]